From a39d5f2ffaf9fa1185cbf34024fec97005407f8d Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 12 Aug 2024 13:12:42 -0700 Subject: [PATCH 1/3] frame-level get and set interface --- src/nested_pandas/nestedframe/core.py | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index aa15401..0b98f99 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -65,6 +65,48 @@ def _is_known_column(self, colname) -> bool: """Determine whether a string is a known column name""" return colname in self.columns or self._is_known_hierarchical_column(colname) + def __getitem__(self, item): + """Adds custom __getitem__ behavior for nested columns""" + + # If a nested column name is passed, return a flat series for that column + # flat series is chosen over list series for utility + # e.g. native ability to do something like ndf["nested.a"] + 3 + if isinstance(item, str) and self._is_known_hierarchical_column(item): + nested, col = item.split(".") + return self[nested].nest.get_flat_series(col) + # Otherwise, do __getitem__ as normal + else: + return super().__getitem__(item) + + def __setitem__(self, key, value): + """Adds custom __setitem__ behavior for nested columns""" + + # Replacing or adding columns to a nested structure + # Allows statements like ndf["nested.t"] = ndf["nested.t"] - 5 + # Or ndf["nested.base_t"] = ndf["nested.t"] - 5 + # Performance note: This requires building a new nested structure + # TODO: Support assignment of a new column to an existing nested col from a list series + if self._is_known_hierarchical_column(key) or ( + "." 
in key and key.split(".")[0] in self.nested_columns + ): + nested, col = key.split(".") + new_flat = self[nested].nest.to_flat() + new_flat[col] = value + packed = packer.pack(new_flat) + return super().__setitem__(nested, packed) + + # Adding a new nested structure from a column + # Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5 + elif "." in key: + new_nested, col = key.split(".") + if isinstance(value, pd.Series): + print("here") + value = value.to_frame() + packed = packer.pack(value) + return super().__setitem__(new_nested, packed) + + return super().__setitem__(key, value) + def add_nested( self, obj, From 66f848b35d663fbaefc99ef0a793ade460931fcf Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 12 Aug 2024 14:08:30 -0700 Subject: [PATCH 2/3] add tests for getitem and setitem --- src/nested_pandas/nestedframe/core.py | 2 +- .../nestedframe/test_nestedframe.py | 71 +++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py index 0b98f99..a7ec7dc 100644 --- a/src/nested_pandas/nestedframe/core.py +++ b/src/nested_pandas/nestedframe/core.py @@ -100,7 +100,7 @@ def __setitem__(self, key, value): elif "." 
in key: new_nested, col = key.split(".") if isinstance(value, pd.Series): - print("here") + value.name = col value = value.to_frame() packed = packer.pack(value) return super().__setitem__(new_nested, packed) diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py index 4b7c010..3dbf34b 100644 --- a/tests/nested_pandas/nestedframe/test_nestedframe.py +++ b/tests/nested_pandas/nestedframe/test_nestedframe.py @@ -65,6 +65,77 @@ def test_is_known_hierarchical_column(): assert not base._is_known_hierarchical_column("base.a") +def test_get_nested_column(): + """Test that __getitem__ can retrieve a nested column""" + + base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + + nested = pd.DataFrame( + data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + + base = base.add_nested(nested, "nested") + + base_c = base["nested.c"] + + # check basic properties + assert isinstance(base_c, pd.Series) + assert np.array_equal(np.array([0, 2, 4, 1, 4, 3, 1, 4, 1]), base_c.values.to_numpy()) + + +def test_set_or_replace_nested_col(): + """Test that __setitem__ can set or replace a column in a existing nested structure""" + + base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + c = [0, 2, 4, 1, 4, 3, 1, 4, 1] + nested = pd.DataFrame( + data={"c": c, "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + + base = base.add_nested(nested, "nested") + + # test direct replacement + base["nested.c"] = base["nested.c"] + 1 + assert np.array_equal(np.array(c) + 1, base["nested.c"].values.to_numpy()) + + # test += syntax + base["nested.c"] += 1 + assert np.array_equal( + np.array(c) + 2, # 2 now, chained from above + base["nested.c"].values.to_numpy(), + ) + + # test new column assignment + base["nested.e"] = base["nested.d"] * 2 + + assert "e" in base.nested.nest.fields + assert 
np.array_equal(base["nested.d"].values.to_numpy() * 2, base["nested.e"].values.to_numpy()) + + +def test_set_new_nested_col(): + """Test that __setitem__ can create a new nested structure""" + base = NestedFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2]) + c = [0, 2, 4, 1, 4, 3, 1, 4, 1] + nested = pd.DataFrame( + data={"c": c, "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]}, + index=[0, 0, 0, 1, 1, 1, 2, 2, 2], + ) + base = base.add_nested(nested, "nested") + + # assign column cd in new_nested from c+d in nested + base["new_nested.cd"] = base["nested.c"] + base["nested.d"] + + assert "new_nested" in base.nested_columns + assert "cd" in base["new_nested"].nest.fields + + assert np.array_equal( + base["new_nested.cd"].values.to_numpy(), + base["nested.c"].values.to_numpy() + base["nested.d"].values.to_numpy(), + ) + + def test_add_nested_with_flat_df(): """Test that add_nested correctly adds a nested column to the base df""" From 306aa389d0df81ded1613296db81b0bd2439fa41 Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Mon, 12 Aug 2024 14:48:37 -0700 Subject: [PATCH 3/3] add tutorial --- docs/tutorials.rst | 1 + docs/tutorials/data_manipulation.ipynb | 207 +++++++++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 docs/tutorials/data_manipulation.ipynb diff --git a/docs/tutorials.rst b/docs/tutorials.rst index 787a425..7e34de4 100644 --- a/docs/tutorials.rst +++ b/docs/tutorials.rst @@ -4,5 +4,6 @@ Tutorials .. 
toctree:: Loading Data into Nested-Pandas + Fine Data Manipulation with Nested-Pandas Lower-level interfaces Using Nested-Pandas with Astronomical Spectra diff --git a/docs/tutorials/data_manipulation.ipynb b/docs/tutorials/data_manipulation.ipynb new file mode 100644 index 0000000..941de7a --- /dev/null +++ b/docs/tutorials/data_manipulation.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Fine Data Manipulation with Nested-Pandas" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This tutorial will briefly showcase how one would perform data manipulation operations from `pandas`, like adding columns, replacing values, etc. with `nested-pandas`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import nested_pandas as npd\n", + "from nested_pandas.datasets import generate_data\n", + "\n", + "# Begin by generating an example dataset\n", + "ndf = generate_data(5, 20, seed=1)\n", + "ndf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show one of the nested dataframes\n", + "ndf.iloc[0].nested" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Nested Column Selection" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First, we can directly fetch a column from our nested column (aptly called \"nested\"). For example, below we can fetch the time column, \"t\", by specifying `\"nested.t\"` as the column to retrieve. This returns a \"flat\" view of the nested \"t\" column, where all rows from all dataframes are present in a single `pandas.Series`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Direct nested column selection\n", + "ndf[\"nested.t\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The advantage of the flat view is that it can be manipulated just like any other `pandas.Series` object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ndf[\"nested.t\"] + 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding or Replacing Nested Columns" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "> *A Note on Performance: These operations involve full reconstruction of the nested columns so expect impacted performance when doing this at scale. It may be appropriate to do these operations within reduce functions directly (e.g. subtracting a value from a column) if performance is key.*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can use the \"base_column.nested_sub_column\" syntax to also perform operations that add new columns or replace existing columns for a nested column. For example, we can directly replace the \"band\" column with a new column that prepends an additional string to the values." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# prepend lsst_ to the band column\n", + "\n", + "ndf[\"nested.band\"] = \"lsst_\" + ndf[\"nested.band\"]\n", + "\n", + "ndf[\"nested.band\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we can create a new column in the \"nested\" column. For example, we can subtract a value from each time value and return it as a new column."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# create a new \"corrected_t\" column in \"nested\"\n", + "\n", + "ndf[\"nested.corrected_t\"] = ndf[\"nested.t\"] - 5\n", + "\n", + "ndf[\"nested.corrected_t\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show the first dataframe again\n", + "ndf.iloc[0].nested" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Adding New Nested Structures" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, we can also add entirely new nested structures using the above syntax." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ndf[\"bands.band_label\"] = ndf[\"nested.band\"]\n", + "ndf" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is functionally equivalent to using `add_nested`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ndf.add_nested(ndf[\"nested.band\"].to_frame(), \"bands_from_add_nested\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}