Commit 3e7212c

cjboyle, ZohebShaikh, and danielballan authored
Fix reading out arrays from database storage (#1010)
* check for array of arrays and convert to ndarray * Add missing asserts * Update xdi test * Update regular expression slightly * Add length check to ensure content but not order * Use \s in re * Add type ignore to expected warnings * Add assert for composite test * Add workaround for tests * add tests for nested arrays where outer dtype is "object" * Add explicit support for array types in Postgres and DuckDB * clean up duplicate tests and fixtures * fix testing on py3.9 due to missing numpy.unstack * Update CHANGELOG.md * remove duplicated functionality * revert test to main * Update check * Add change log * use RNG and fix tests comparing wrong expected+actual pairs * remove unused sqlite fixtures --------- Co-authored-by: Zoheb Shaikh <[email protected]> Co-authored-by: Dan Allan <[email protected]>
1 parent 4a08ca2 commit 3e7212c

4 files changed (+308, -0 lines)


CHANGELOG.md

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,12 @@ Write the date in place of the "Unreleased" in the case a new version is released.

 # Changelog

+## v0.1.0-b32 (Unreleased)
+
+### Fixed
+
+- Uniform array columns read from Postgres/DuckDB are now aggregated to an
+  NDArray (e.g. scanned `waveform` PVs)

 ## v0.1.0-b32 (2025-08-04)

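To illustrate what the changelog entry above means, here is a minimal standalone sketch (not taken from the Tiled codebase): an object-dtype column whose rows are equal-length ("uniform") arrays can be stacked into one n-dimensional array with a concrete numeric dtype.

import numpy as np

# Build an object array whose elements are equal-length arrays, roughly how a
# column of waveform-like readings comes back from database storage.
column = np.empty((3,), dtype=object)
for i, row in enumerate([np.arange(3), np.arange(3, 6), np.arange(6, 9)]):
    column[i] = row

# Stacking exposes the underlying numeric dtype instead of "object".
stacked = np.vstack(column)
print(stacked.shape)  # (3, 3)
print(stacked.dtype)  # a numeric dtype (platform integer), rather than object
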
Lines changed: 256 additions & 0 deletions
@@ -0,0 +1,256 @@ (new file)

from typing import Callable, cast

import numpy as np
import pyarrow as pa
import pytest

from tiled._tests.adapters.test_sql import adapter_duckdb_many_partitions  # noqa: F401
from tiled._tests.adapters.test_sql import adapter_duckdb_one_partition  # noqa: F401
from tiled._tests.adapters.test_sql import adapter_psql_many_partitions  # noqa: F401
from tiled._tests.adapters.test_sql import adapter_psql_one_partition  # noqa: F401
from tiled._tests.adapters.test_sql import assert_same_rows
from tiled.adapters.sql import SQLAdapter
from tiled.storage import SQLStorage, parse_storage, register_storage
from tiled.structures.core import StructureFamily
from tiled.structures.data_source import DataSource, Management
from tiled.structures.table import TableStructure

rng = np.random.default_rng(42)

names = ["i0", "i1", "i2", "i3", "f4", "f5"]
batch_size = 5
data0 = [
    pa.array(
        [rng.integers(-100, 100, size=10, dtype=np.int8) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=11, dtype=np.int16) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=12, dtype=np.int32) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=13, dtype=np.int64) for _ in range(batch_size)]
    ),
    pa.array([rng.random(size=14, dtype=np.float32) for _ in range(batch_size)]),
    pa.array([rng.random(size=15, dtype=np.float64) for _ in range(batch_size)]),
]
batch_size = 8
data1 = [
    pa.array(
        [rng.integers(-100, 100, size=10, dtype=np.int8) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=11, dtype=np.int16) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=12, dtype=np.int32) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=13, dtype=np.int64) for _ in range(batch_size)]
    ),
    pa.array([rng.random(size=14, dtype=np.float32) for _ in range(batch_size)]),
    pa.array([rng.random(size=15, dtype=np.float64) for _ in range(batch_size)]),
]
batch_size = 3
data2 = [
    pa.array(
        [rng.integers(-100, 100, size=10, dtype=np.int8) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=11, dtype=np.int16) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=12, dtype=np.int32) for _ in range(batch_size)]
    ),
    pa.array(
        [rng.integers(-100, 100, size=13, dtype=np.int64) for _ in range(batch_size)]
    ),
    pa.array([rng.random(size=14, dtype=np.float32) for _ in range(batch_size)]),
    pa.array([rng.random(size=15, dtype=np.float64) for _ in range(batch_size)]),
]

batch0 = pa.record_batch(data0, names=names)
batch1 = pa.record_batch(data1, names=names)
batch2 = pa.record_batch(data2, names=names)


@pytest.fixture
def data_source_from_init_storage() -> Callable[[str, int], DataSource[TableStructure]]:
    def _data_source_from_init_storage(
        data_uri: str, num_partitions: int
    ) -> DataSource[TableStructure]:
        table = pa.Table.from_arrays(data0, names)
        structure = TableStructure.from_arrow_table(table, npartitions=num_partitions)
        data_source = DataSource(
            management=Management.writable,
            mimetype="application/x-tiled-sql-table",
            structure_family=StructureFamily.table,
            structure=structure,
            assets=[],
        )

        storage = cast(SQLStorage, parse_storage(data_uri))
        register_storage(storage)
        return SQLAdapter.init_storage(data_source=data_source, storage=storage)

    return _data_source_from_init_storage


@pytest.mark.parametrize(
    "adapter_name", [("adapter_duckdb_one_partition"), ("adapter_psql_one_partition")]
)
def test_write_read_one_batch_one_part(
    adapter_name: str, request: pytest.FixtureRequest
) -> None:
    # get adapter from fixture
    adapter: SQLAdapter = request.getfixturevalue(adapter_name)

    # test appending and reading a table as a whole
    test_table = pa.Table.from_arrays(data0, names)

    adapter.append_partition(batch0, 0)
    result_read = adapter.read()
    assert test_table == pa.Table.from_pandas(result_read)

    # test appending and reading a partition in a table
    result_read_partition = adapter.read_partition(0)
    assert test_table == pa.Table.from_pandas(result_read_partition)


@pytest.mark.parametrize(
    "adapter_name", [("adapter_duckdb_one_partition"), ("adapter_psql_one_partition")]
)
def test_write_read_list_batch_one_part(
    adapter_name: str, request: pytest.FixtureRequest
) -> None:
    # get adapter from fixture
    adapter: SQLAdapter = request.getfixturevalue(adapter_name)

    test_table = pa.Table.from_batches([batch0, batch1, batch2])
    # test appending a list of batches to a table and read as a whole
    adapter.append_partition([batch0, batch1, batch2], 0)
    result_read = adapter.read()

    assert test_table == pa.Table.from_pandas(result_read)

    # test appending and reading a partition in a table
    result_read_partition = adapter.read_partition(0)

    assert test_table == pa.Table.from_pandas(result_read_partition)

    # test appending few more times done correctly
    test_table = pa.Table.from_batches(
        [batch0, batch1, batch2, batch2, batch0, batch1, batch1, batch2, batch0]
    )
    adapter.append_partition([batch2, batch0, batch1], 0)
    adapter.append_partition([batch1, batch2, batch0], 0)
    result_read = adapter.read()

    assert test_table == pa.Table.from_pandas(result_read)

    # test appending a few times and reading done correctly
    result_read_partition = adapter.read_partition(0)

    assert test_table == pa.Table.from_pandas(result_read_partition)


@pytest.mark.parametrize(
    "adapter_name",
    [("adapter_duckdb_many_partitions"), ("adapter_psql_many_partitions")],
)
def test_append_single_partition(
    adapter_name: str, request: pytest.FixtureRequest
) -> None:
    # get adapter from fixture
    adapter: SQLAdapter = request.getfixturevalue(adapter_name)

    # test writing an entire pyarrow table to a single partition
    table = pa.Table.from_batches([batch0, batch1, batch2])
    adapter.append_partition(table, 0)

    result_read = adapter.read()
    assert table == pa.Table.from_pandas(result_read)

    # test reading a specific partition
    result_read_partition = adapter.read_partition(0)
    assert table == pa.Table.from_pandas(result_read_partition)


@pytest.mark.parametrize("adapter_name", [("adapter_psql_many_partitions")])
@pytest.mark.parametrize("field", names)
def test_write_read_one_batch_many_part(
    adapter_name: str, request: pytest.FixtureRequest, field: str
) -> None:
    # get adapter from fixture
    adapter: SQLAdapter = request.getfixturevalue(adapter_name)

    # test writing to many partitions and reading it whole
    adapter.append_partition(batch0, 0)
    adapter.append_partition(batch1, 1)
    adapter.append_partition(batch2, 2)

    result_read = adapter.read()

    assert pa.Table.from_batches([batch0, batch1, batch2]) == pa.Table.from_pandas(
        result_read
    )

    # test reading a specific partition
    result_read_partition = adapter.read_partition(0)
    assert pa.Table.from_arrays(data0, names) == pa.Table.from_pandas(
        result_read_partition
    )

    result_read_partition = adapter.read_partition(1)
    assert pa.Table.from_arrays(data1, names) == pa.Table.from_pandas(
        result_read_partition
    )

    result_read_partition = adapter.read_partition(2)
    assert pa.Table.from_arrays(data2, names) == pa.Table.from_pandas(
        result_read_partition
    )

    # test appending a few times and reading done correctly
    adapter.append_partition(batch0, 1)
    adapter.append_partition(batch1, 2)
    adapter.append_partition(batch2, 0)

    result_read = adapter.read()

    # Check that each partition matches
    assert_same_rows(
        pa.Table.from_batches([batch0, batch2]),
        pa.Table.from_pandas(adapter.read_partition(0)),
    )
    assert_same_rows(
        pa.Table.from_batches([batch1, batch0]),
        pa.Table.from_pandas(adapter.read_partition(1)),
    )
    assert_same_rows(
        pa.Table.from_batches([batch2, batch1]),
        pa.Table.from_pandas(adapter.read_partition(2)),
    )
    assert_same_rows(
        pa.Table.from_batches([batch0, batch2, batch1, batch0, batch2, batch1]),
        pa.Table.from_pandas(result_read),
    )

    # read a specific field
    result_read = adapter.read_partition(0, fields=[field])
    field_index = names.index(field)
    assert np.array_equal(
        [*data0[field_index].tolist(), *data2[field_index].tolist()],
        result_read[field].tolist(),
    )
    result_read = adapter.read_partition(1, fields=[field])
    assert np.array_equal(
        [*data1[field_index].tolist(), *data0[field_index].tolist()],
        result_read[field].tolist(),
    )
    result_read = adapter.read_partition(2, fields=[field])
    assert np.array_equal(
        [*data2[field_index].tolist(), *data1[field_index].tolist()],
        result_read[field].tolist(),
    )

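For context on the test data above: each column is built from per-row NumPy arrays, which pyarrow infers as a list-typed column — presumably the kind of column that the commit's Postgres/DuckDB array-type support writes out and reads back as nested arrays. A small standalone sketch (assuming only numpy and pyarrow, none of the Tiled fixtures):

import numpy as np
import pyarrow as pa

rng = np.random.default_rng(0)

# A column where every row is a length-10 int8 array, mirroring the
# construction of data0/data1/data2 above.
col = pa.array([rng.integers(-100, 100, size=10, dtype=np.int8) for _ in range(5)])

print(col.type)            # list<item: int8>
print(len(col))            # 5 rows
print(col.to_pylist()[0])  # the first row as a plain Python list of 10 ints
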
tiled/_tests/test_array.py

Lines changed: 34 additions & 0 deletions
@@ -65,6 +65,20 @@
     }
 )
 
+nd_array = numpy.arange(9).reshape((3, 3))
+uniform_array = numpy.empty((3,), dtype=object)
+for i in range(uniform_array.shape[0]):
+    uniform_array[i] = nd_array[i]
+ragged_array = numpy.array([numpy.arange(3), numpy.arange(4, 10)], dtype=object)
+object_array = numpy.full((10,), {"a": 1}, dtype=object)
+nested_arrays_tree = MapAdapter(
+    {
+        "uniform": ArrayAdapter.from_array(uniform_array),
+        "ragged": ArrayAdapter.from_array(ragged_array),
+        "objects": ArrayAdapter.from_array(object_array),
+    }
+)
+
 
 @pytest.fixture(scope="module")
 def context():
@@ -75,6 +89,7 @@ def context():
             "inf": inf_tree,
             "scalar": scalar_tree,
             "zero": zero_tree,
+            "nested_arrays": nested_arrays_tree,
         }
     )
     app = build_app(tree)
@@ -166,6 +181,25 @@ def test_array_interface(context):
     v.dims
 
 
+def test_uniform_nested_array_projected_to_ndarray(context):
+    client = from_context(context)["nested_arrays"]["uniform"]
+    assert client.dtype == numpy.int_
+    assert client.read().dtype == numpy.int_
+    assert numpy.array_equal(client.read(), nd_array)
+
+
+@pytest.mark.parametrize("kind", ["ragged", "objects"])
+def test_unparsable_nested_array_stringified(kind, context):
+    # This behavior is due to the fact that ragged Numpy arrays, and those with
+    # non-numeric types (except for strings) will likely have dtype=object,
+    # which may not be parsable or reducible. As such we fallback to taking the
+    # string representations of the array elements.
+    client = from_context(context)["nested_arrays"][kind]
+    assert "<U" in client.dtype.str
+    assert "<U" in client.read().dtype.str
+    assert isinstance(client[0], str)
+
+
 @pytest.mark.parametrize("kind", list(array_cases))
 def test_as_buffer(kind):
     output = as_buffer(array_cases[kind], {})

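As the comment in test_unparsable_nested_array_stringified explains, ragged or otherwise object-typed arrays cannot be reduced to a single ndarray, so their elements end up stringified. An illustrative sketch (plain NumPy, not Tiled code; the astype(str) fallback shown here only approximates the behavior the "<U" assertions above exercise):

import numpy as np

# Same ragged construction as in the test above: rows of different lengths.
ragged = np.array([np.arange(3), np.arange(4, 10)], dtype=object)

try:
    np.vstack(ragged)  # stacking requires uniform row lengths
except ValueError:
    # Rows of length 3 and 6 cannot be stacked, so one fallback is to keep the
    # elements as their string representations, which yields a unicode dtype.
    as_strings = ragged.astype(str)
    print(as_strings.dtype.str)  # a "<U..." unicode dtype, matching the tests
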
tiled/adapters/array.py

Lines changed: 12 additions & 0 deletions
@@ -1,3 +1,4 @@
+import contextlib
 from typing import Any, List, Optional, Set, Tuple
 
 import dask.array
@@ -82,6 +83,17 @@ def from_array(
         if not hasattr(array, "__array__"):
            array = numpy.asanyarray(array)
 
+        # Convert array of arrays to ND array to expose the underlying dtype
+        is_array_of_arrays = (
+            array.dtype == "object"
+            and array.shape[0]
+            and isinstance(array[0], numpy.ndarray)
+        )
+        if is_array_of_arrays:
+            with contextlib.suppress(ValueError):
+                # only uniform arrays (with same dimensions) are stackable
+                array = numpy.vstack(array)
+
         # Convert (experimental) pandas.StringDtype to numpy's unicode string dtype
         is_likely_string_dtype = isinstance(array.dtype, pandas.StringDtype) or (
             array.dtype == "object" and array.dtype.fields is None

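A short usage sketch of the new from_array behavior (assuming a Tiled version that includes this change, and that ArrayAdapter.read() with no arguments returns the full array, as the client-side tests above suggest; variable names are illustrative):

import numpy
from tiled.adapters.array import ArrayAdapter

# An object array whose rows are equal-length numeric arrays...
rows = numpy.empty((3,), dtype=object)
for i in range(3):
    rows[i] = numpy.arange(3) * i

adapter = ArrayAdapter.from_array(rows)

# ...is stacked internally, so reads expose the numeric dtype and 2D shape
# rather than dtype=object.
result = adapter.read()
print(result.shape)  # (3, 3)
print(result.dtype)  # a numeric dtype, not object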